{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 1 Download and prepare data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "DATA_DIR = 'PATH_TO_THE_DATA_DIR'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# This example is for demonstration purposes\n",
    "# Please refer to the corresponding NLP tutorial on NeMo documentation\n",
    "! bash get_wkt2.sh $DATA_DIR"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# verify data is there \n",
    "! ls -l $DATA_DIR/wikitext-2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prepare tokenization model\n",
    "! python create_vocab.py --train_path=$DATA_DIR/wikitext-2/train.txt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 2 - import necessary packages, define hyperparameters, create tokenizer instance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import torch\n",
    "import nemo\n",
    "from nemo import logging\n",
    "\n",
    "import nemo.collections.nlp as nemo_nlp\n",
    "from nemo.collections.nlp.callbacks.lm_bert_callback import eval_iter_callback, \\\n",
    "    eval_epochs_done_callback\n",
    "from nemo.utils.lr_policies import CosineAnnealing\n",
    "\n",
    "BATCHES_PER_STEP = 1\n",
    "BATCH_SIZE = 64\n",
    "BATCH_SIZE_EVAL = 16\n",
    "D_MODEL = 768\n",
    "D_INNER = 3072\n",
    "HIDDEN_ACT = \"relu\"\n",
    "LEARNING_RATE = 0.0001\n",
    "LR_WARMUP_PROPORTION = 0.05\n",
    "MASK_PROBABILITY = 0.15\n",
    "MAX_SEQ_LENGTH = 128\n",
    "NUM_EPOCHS = 1\n",
    "NUM_HEADS = 12\n",
    "# Note that for Demo purposes this is set to just one epoch\n",
    "NUM_LAYERS = 1\n",
    "OPTIMIZER = \"adam_w\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Instantiate neural factory with supported backend\n",
    "neural_factory = nemo.core.NeuralModuleFactory(\n",
    "    backend=nemo.core.Backend.PyTorch,\n",
    "\n",
    "    # If you're training with multiple GPUs, you should handle this value with\n",
    "    # something like argparse. See examples/nlp/bert_pretraining.py for an example.\n",
    "    local_rank=None,\n",
    "\n",
    "    # If you're training with mixed precision, this should be set to mxprO1 or mxprO2.\n",
    "    # See https://nvidia.github.io/apex/amp.html#opt-levels for more details.\n",
    "    optimization_level=nemo.core.Optimization.mxprO1,\n",
    "\n",
    "    # If you're training with multiple GPUs, this should be set to\n",
    "    # nemo.core.DeviceType.AllGpu\n",
    "    placement=nemo.core.DeviceType.GPU)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# tokenizer.model file was created during Step 1\n",
    "tokenizer = nemo_nlp.data.SentencePieceTokenizer(model_path=\"tokenizer.model\")\n",
    "special_tokens = nemo_nlp.data.get_bert_special_tokens('bert')\n",
    "tokenizer.add_special_tokens(special_tokens)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Instantiate necessary neural modules"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bert_model = nemo_nlp.nm.trainables.huggingface.BERT(\n",
    "    vocab_size=tokenizer.vocab_size,\n",
    "    num_hidden_layers=NUM_LAYERS,\n",
    "    hidden_size=D_MODEL,\n",
    "    num_attention_heads=NUM_HEADS,\n",
    "    intermediate_size=D_INNER,\n",
    "    max_position_embeddings=MAX_SEQ_LENGTH,\n",
    "    hidden_act=HIDDEN_ACT\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Masked Language Modeling Loss\n",
    "mlm_classifier = nemo_nlp.nm.trainables.BertTokenClassifier(D_MODEL,\n",
    "                                          num_classes=tokenizer.vocab_size,\n",
    "                                              activation=HIDDEN_ACT,\n",
    "                                          log_softmax=True)\n",
    "mlm_loss = nemo_nlp.nm.losses.SmoothedCrossEntropyLoss()\n",
    "\n",
    "# Next Sentence Prediciton Loss\n",
    "nsp_classifier = nemo_nlp.nm.trainables.SequenceClassifier(D_MODEL,\n",
    "                                             num_classes=2,\n",
    "                                             num_layers=2,\n",
    "                                             activation='tanh',\n",
    "                                             log_softmax=False)\n",
    "nsp_loss = nemo.backends.pytorch.common.CrossEntropyLossNM()\n",
    "\n",
    "bert_loss = nemo.backends.pytorch.common.LossAggregatorNM(num_inputs=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data_layer = nemo_nlp.nm.data_layers.BertPretrainingDataLayer(\n",
    "    tokenizer=tokenizer,\n",
    "    dataset=os.path.join(DATA_DIR, \"wikitext-2\", \"train.txt\"),\n",
    "    max_seq_length=MAX_SEQ_LENGTH,\n",
    "    mask_probability=MASK_PROBABILITY,\n",
    "    batch_size=BATCH_SIZE,\n",
    "    shuffle=True\n",
    ")\n",
    "\n",
    "eval_data_layer = nemo_nlp.nm.data_layers.BertPretrainingDataLayer(\n",
    "    tokenizer=tokenizer,\n",
    "    dataset=os.path.join(DATA_DIR, \"wikitext-2\", \"valid.txt\"),\n",
    "    max_seq_length=MAX_SEQ_LENGTH,\n",
    "    mask_probability=MASK_PROBABILITY,\n",
    "    batch_size=BATCH_SIZE_EVAL,\n",
    "    shuffle=False\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 3 - Describe training and evaluation DAGs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Training DAG\n",
    "input_data = train_data_layer()\n",
    "\n",
    "hidden_states = bert_model(input_ids=input_data.input_ids,\n",
    "                           token_type_ids=input_data.input_type_ids,\n",
    "                           attention_mask=input_data.input_mask)\n",
    "\n",
    "mlm_logits = mlm_classifier(hidden_states=hidden_states)\n",
    "t_mlm_loss = mlm_loss(logits=mlm_logits, labels=input_data.output_ids, output_mask=input_data.output_mask)\n",
    "\n",
    "nsp_logits = nsp_classifier(hidden_states=hidden_states)\n",
    "t_nsp_loss = nsp_loss(logits=nsp_logits, labels=input_data.labels)\n",
    "\n",
    "loss = bert_loss(loss_1=t_mlm_loss, loss_2=t_nsp_loss)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Evaluation DAG\n",
    "input_data_eval = eval_data_layer()\n",
    "\n",
    "e_hidden_states = bert_model(input_ids=input_data_eval.input_ids,\n",
    "                           token_type_ids=input_data_eval.input_type_ids,\n",
    "                           attention_mask=input_data_eval.input_mask)\n",
    "\n",
    "e_mlm_logits = mlm_classifier(hidden_states=e_hidden_states)\n",
    "e_mlm_loss = mlm_loss(logits=e_mlm_logits, labels=input_data_eval.output_ids, output_mask=input_data_eval.output_mask)\n",
    "\n",
    "e_nsp_logits = nsp_classifier(hidden_states=e_hidden_states)\n",
    "e_nsp_loss = nsp_loss(logits=e_nsp_logits, labels=input_data_eval.labels)\n",
    "\n",
    "e_loss = bert_loss(loss_1=e_mlm_loss, loss_2=e_nsp_loss)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "callback_loss = nemo.core.SimpleLossLoggerCallback(\n",
    "    tensors=[loss],\n",
    "    print_func=lambda x: logging.info(\"Loss: {:.3f}\".format(x[0].item())))\n",
    "\n",
    "train_data_size = len(train_data_layer)\n",
    "\n",
    "# If you're training on multiple GPUs, this should be\n",
    "# train_data_size / (batch_size * batches_per_step * num_gpus)\n",
    "steps_per_epoch = int(train_data_size / (BATCHES_PER_STEP * BATCH_SIZE))\n",
    "\n",
    "callback_eval = nemo.core.EvaluatorCallback(\n",
    "    eval_tensors=[e_mlm_loss, e_nsp_loss],\n",
    "    user_iter_callback=eval_iter_callback,\n",
    "    user_epochs_done_callback=eval_epochs_done_callback,\n",
    "    eval_step=steps_per_epoch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "lr_policy = CosineAnnealing(NUM_EPOCHS * steps_per_epoch,\n",
    "                            warmup_ratio=LR_WARMUP_PROPORTION)\n",
    "neural_factory.train(tensors_to_optimize=[loss],\n",
    "                lr_policy=lr_policy,\n",
    "                callbacks=[callback_loss, callback_eval],\n",
    "                batches_per_step=BATCHES_PER_STEP,\n",
    "                optimizer=OPTIMIZER,\n",
    "                optimization_params={\n",
    "                    \"batch_size\": BATCH_SIZE,\n",
    "                    \"num_epochs\": NUM_EPOCHS,\n",
    "                    \"lr\": LEARNING_RATE,\n",
    "                    \"betas\": (0.95, 0.98),\n",
    "                    \"grad_norm_clip\": None\n",
    "                })"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  },
  "pycharm": {
   "stem_cell": {
    "cell_type": "raw",
    "metadata": {
     "collapsed": false
    },
    "source": []
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
